See the source for details
RAD_data_dir <- "~/Documents/Grunwald/short-scripts/Genotype_error/"
library('poppr')
## Loading required package: adegenet
## Loading required package: ade4
## ==========================
## adegenet 1.4-2 is loaded
## ==========================
##
## - to start, type '?adegenet'
## - to browse adegenet website, type 'adegenetWeb()'
## - to post questions/comments: adegenet-forum@lists.r-forge.r-project.org
##
##
## This is poppr version 1.1.4.99.125. To get started, type package?poppr
library('ape')
library('phangorn')
library('animation')
infdat <- RCurl::getURL("https://raw.githubusercontent.com/grunwaldlab/phytophthora_id/master/shiny-server/www/genoid_infestans/reduced_database.txt.csv")
infdat <- read.table(text = infdat, head = TRUE)
pinf <- df2genind(infdat[-c(1,2)], sep = "/", ploidy = 3, ind.names = infdat[[1]], pop = infdat[[2]])
ssr <- c(3,3,2,3,3,2,2,3,3,3,3,3)
x <- as.genclone(pinf)
fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))
# #
# tiff(filename = "images/pinf_cluster.tiff", width = 85, height = 68, units = "mm", res = 1200, pointsize = 6)
# fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
# title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))
# dev.off()
# legend("topright", legend = c("Nearest Neighbor", "UPGMA", "Farthest Neighbor"),
# col = c("red", "black", "blue"), pch = 1, title = "Clustering Method")
The plot above shows how multilocus genotypes collapse under differing algorithms over genetic distance.
Below, we will collapse MLGs with a threshold of an average of 2 mutational steps over all loci and create contingency tables relating the clustered MLGs to the previously defined MLGs (eg. US-8).
z <- filter_stats(x, bruvo.dist, replen = ssr, loss = FALSE, threshold = 0.75/12, stats = "MLGS")
print(table(pop(x), z$farthest), zero.print = ".")
##
## 1 3 5 8 9 10 12 13 15 17 19 20 22 23 24 25 27 28 29
## B . . . . . . . . 1 . . . . . . . . . .
## C . 1 . . . . . . . . . . . . . . . . .
## EU-13 . . . . 1 . . . . . . . . . . . . . .
## EU-4 . . . . . 1 . . . . . . . . . . . . .
## EU-8 . . . . . . . 1 . . . . . . . . . . .
## EU-5 . . . . . . 2 . . . . . . . . . . . .
## US-8 . . . . . . . . . . . 1 . . 2 1 . . .
## D.1 1 . . . . . . . . . . . . . . . . . .
## D.2 1 . . . . . . . . . . . . . . . . . .
## US-11 . . . . . . . . . 2 . . . . . . . . .
## US-12 . . 1 . . . . . . . . . . . . . . . .
## US-14 . . . . . . . . . . . . 1 . . . . . .
## US-17 . . . . . . . . . . . . . . . . . 1 .
## US-20 . . . . . . . . . . . . . . . . 2 . .
## US-21 . . . . . . . . . . 2 . . . . . . . .
## US-22 . . . . . . . . . . . . . . . . . . 2
## US-23 . . . 3 . . . . . . . . . . . . . . .
## US-24 . . . . . . . . . . . . . 3 . . . . .
print(table(pop(x), z$nearest), zero.print = ".")
##
## 1 3 5 6 9 10 12 13 15 17 19 20 23 24 25 27 28 29
## B . . . . . . . . 1 . . . . . . . . .
## C . 1 . . . . . . . . . . . . . . . .
## EU-13 . . . . 1 . . . . . . . . . . . . .
## EU-4 . . . . . 1 . . . . . . . . . . . .
## EU-8 . . . . . . . 1 . . . . . . . . . .
## EU-5 . . . . . . 2 . . . . . . . . . . .
## US-8 . . . . . . . . . . . 1 . 2 1 . . .
## D.1 1 . . . . . . . . . . . . . . . . .
## D.2 1 . . . . . . . . . . . . . . . . .
## US-11 . . . . . . . . . 2 . . . . . . . .
## US-12 . . 1 . . . . . . . . . . . . . . .
## US-14 . . . . . . . . . . . . . 1 . . . .
## US-17 . . . . . . . . . . . . . . . . 1 .
## US-20 . . . . . . . . . . . . . . . 2 . .
## US-21 . . . . . . . . . . 2 . . . . . . .
## US-22 . . . . . . . . . . . . . . . . . 2
## US-23 . . . 3 . . . . . . . . . . . . . .
## US-24 . . . . . . . . . . . . 3 . . . . .
tab <- mlg.table(x, bar = FALSE)
colnames(tab) <- 1:ncol(tab)
print.table(tab, zero.print = ".") # contingency table for zero tolerance MLGs.
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## B . . . . . . . . . . . . . . 1 . . . . . . . . . . .
## C . . 1 . . . . . . . . . . . . . . . . . . . . . . .
## EU-13 . . . . . . . . 1 . . . . . . . . . . . . . . . . .
## EU-4 . . . . . . . . . 1 . . . . . . . . . . . . . . . .
## EU-8 . . . . . . . . . . . . 1 . . . . . . . . . . . . .
## EU-5 . . . . . . . . . . 1 1 . . . . . . . . . . . . . .
## US-8 . . . . . . . . . . . . . 1 . . . . . 1 . . . 1 1 .
## D.1 1 . . . . . . . . . . . . . . . . . . . . . . . . .
## D.2 . 1 . . . . . . . . . . . . . . . . . . . . . . . .
## US-11 . . . . . . . . . . . . . . . 1 1 . . . . . . . . .
## US-12 . . . . 1 . . . . . . . . . . . . . . . . . . . . .
## US-14 . . . . . . . . . . . . . . . . . . . . . 1 . . . .
## US-17 . . . . . . . . . . . . . . . . . . . . . . . . . .
## US-20 . . . . . . . . . . . . . . . . . . . . . . . . . 1
## US-21 . . . . . . . . . . . . . . . . . 1 1 . . . . . . .
## US-22 . . . 1 . . . . . . . . . . . . . . . . . . . . . .
## US-23 . . . . . 1 1 1 . . . . . . . . . . . . . . . . . .
## US-24 . . . . . . . . . . . . . . . . . . . . 1 . 2 . . .
## 27 28 29
## B . . .
## C . . .
## EU-13 . . .
## EU-4 . . .
## EU-8 . . .
## EU-5 . . .
## US-8 . . .
## D.1 . . .
## D.2 . . .
## US-11 . . .
## US-12 . . .
## US-14 . . .
## US-17 . 1 .
## US-20 1 . .
## US-21 . . .
## US-22 . . 1
## US-23 . . .
## US-24 . . .
Most of the MLGs were able to be resolved. US-8 and D.1 and D.2 are not exactly resolved, but that is shown in the trees produced from the commented code below.
# Uncomment to regenerate plots
#
# for (i in names(fstats)){
# HTML_collapse(measure = i, x = x, treefun = "upgma",
# distfun = "bruvo.dist", fstats = fstats, destdir = "images",
# replen = ssr, loss = FALSE)
# HTML_collapse(measure = i, x = x, treefun = "nj",
# distfun = "bruvo.dist", fstats = fstats, destdir = "images",
# replen = ssr, loss = FALSE)
# }
This will create 20 populations with 20 samples and 10k SNPs. Each population will have:
In addition, half of these populations will have undergone one generation of clonal reproduction.
set.seed(20150412)
x <- lapply(1:10, getSims, n = 20, snps = 1e3, strucrat = 1, ploidy = 2, err = 0.05, na.perc = 0.21, clone = TRUE, n.cores = 4)
## Loading required package: parallel
y <- lapply(1:10, getSims, n = 20, snps = 1e3, strucrat = 1, ploidy = 2, err = 0.05, na.perc = 0.21, clone = FALSE, n.cores = 4)
# x <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = TRUE, n.cores = 4)
# y <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = FALSE, n.cores = 4)
For analysis, \(\frac{1}{5}\)th of the pooled samples will be kept.
x <- do.call("rbind", c(x, y))
x <- x[sample(nInd(x), nInd(x)/5)]
trueclones <- duplicated(substr(indNames(x), start = 1, stop = 10))
fstats <- filter_stats(x, bitwise.dist, plot = TRUE, nclone = sum(!trueclones))
# the_threshold <- fstats$average$thresholds[sum(trueclones)] + .Machine$double.eps^0.5
the_threshold <- threshold_predictor(fstats$average$thresholds)
thresh <- duplicated(mlg.filter(x, distance = bitwise.dist,
threshold = the_threshold,
algo = "a"))
(threshtable <- table(thresh, trueclones))
## trueclones
## thresh FALSE TRUE
## FALSE 75 0
## TRUE 0 7
The tabulation is a power analysis of how many true and false positives there are when collapsing at the threshold that gives the same number of known clones/replicates.
Below is labelling a tree with known clones.
the_tree <- upgma(bitwise.dist(x))
clones <- substr(the_tree$tip.label[thresh], start = 1, stop = 10)
clones <- lapply(clones, grep, the_tree$tip.label)
edgelist <- length(which.edge(the_tree, the_tree$tip.label))
edgecols <- rep("black", edgelist)
for (i in clones){
edgecols[which.edge(the_tree, the_tree$tip.label[i])] <- "red"
}
plot.phylo(the_tree, edge.color = edgecols, adj = 0, label.offset = 0.001)
axisPhylo(1)
title("Random sequences with 1000 SNPs and a 0.21 error rate")
nreps <- 100
resarray <- array(data = integer(nreps*4), dim = c(2, 2, nreps),
dimnames = c(dimnames(threshtable), NULL))
neararray <- resarray
fararray <- resarray
avarray <- resarray
samplist <- lapply(1:nreps, function(x) list(samp = NULL, tree = NULL,
mlgs = NULL))
Sys.time()
## [1] "2015-04-15 12:47:49 PDT"
for (i in 1:nreps){
set.seed(i) # setting seed for accuracy.
snps <- rpois(1, 1e3)
samp1 <- lapply(1:10, getSims, n = 20, snps = snps, strucrat = 1, ploidy = 2,
err = 0.05, na.perc = 0.21, clone = TRUE, n.cores = 4)
samp2 <- lapply(1:10, getSims, n = 20, snps = snps, strucrat = 1, ploidy = 2,
err = 0.05, na.perc = 0.21, clone = FALSE, n.cores = 4)
samp <- do.call("rbind", c(samp1, samp2))
samp@ploidy <- rep(2L, nInd(samp))
samp <- samp[sample(nInd(samp), nInd(samp)/5)]
trueclones <- duplicated(substr(indNames(samp), start = 1, stop = 10))
fstats <- filter_stats(samp, bitwise.dist, plot = TRUE)
# the_threshold <- fstats$average$thresholds[sum(trueclones)] + .Machine$double.eps^0.5
title(paste("seed:", i, "n:", nInd(samp), "snps:", snps))
the_threshold <- threshold_predictor(fstats$average$thresholds)
the_distance <- bitwise.dist(x)
z <- filter_stats(x = samp, distance = bitwise.dist,
threshold = the_threshold, stats = "MLGs")
abline(v = the_threshold, lty = 2)
text(the_threshold, 0,
labels = paste("Threshold:", signif(the_threshold, 3)),
adj = 0)
samplist[[i]]$samp <- samp
samplist[[i]]$tree <- upgma(the_distance)
samplist[[i]]$mlgs <- z
athresh <- duplicated(z$average)
nthresh <- duplicated(z$nearest)
fthresh <- duplicated(z$farthest)
avarray[, , i] <- table(athresh, trueclones)
avarray[, , i] <- sweep(avarray[, , i], 2, colSums(avarray[, , i]), "/")
neararray[, , i] <- table(nthresh, trueclones)
neararray[, , i] <- sweep(neararray[, , i], 2, colSums(neararray[, , i]), "/")
fararray[, , i] <- table(fthresh, trueclones)
fararray[, , i] <- sweep(fararray[, , i], 2, colSums(fararray[, , i]), "/")
}
Sys.time()
## [1] "2015-04-15 12:52:57 PDT"
# color_mlg_tree(samp, upgma(bitwise.dist(samp)), z$average)
# axisPhylo(1)
Now we get to see how well we did.
(ares <- apply(avarray, 1:2, mean))
## trueclones
## thresh FALSE TRUE
## FALSE 0.97782633 0.02488684
## TRUE 0.02217367 0.97511316
(nres <- apply(neararray, 1:2, mean))
## trueclones
## thresh FALSE TRUE
## FALSE 0.97757633 0.02463684
## TRUE 0.02242367 0.97536316
(fres <- apply(fararray, 1:2, mean))
## trueclones
## thresh FALSE TRUE
## FALSE 0.97795133 0.02501184
## TRUE 0.02204867 0.97498816
| True Positive % | False Positive % | |
|---|---|---|
| farthest | 97.5 | 2.22 |
| average | 97.5 | 2.24 |
| nearest | 97.5 | 2.20 |
Note that this data has no reference and has a lot of error. There are 10 technical replicates. Each file represents a different parameter used for STACKS.
plinklist <- list(m3 = character(0), m4 = character(0), m10 = character(0), def = character(0))
plinklist[["m4"]] <- "2R/PopSamples/data.out/PopSamples_m4/Popsouts_Rselec/out.replicates/plink.raw"
plinklist[["def"]] <- "2R/PopSamples/data.out/PopSamples_def/Popsouts_Rselec/out.replicates/plink.raw"
plinklist[["m3"]] <- "2R/PopSamples/data.out/PopSamples_m3/Popsouts_Rselec/out.replicates/plink.raw"
plinklist[["m10"]] <- "2R/PopSamples/data.out/PopSamples_m10/Popsouts_Rselec/out.replicates/plink.raw"
contlist <- plinklist # Contingency tables
threshlist <- plinklist # Threshold stats
Steps: 1. read in data 2. mlg.filter on all algorithms and plot the thresholds. 3. create the contingency table for each output (using UPGMA method).
Note for each plot regarding the MLG filter:
The dotted lines represent the threshold at which the algorithms each creates 10 clusters.
for (i in names(plinklist)){
barb <- read.PLINK(paste(RAD_data_dir, plinklist[[i]], sep = "/"))
show(barb)
fstats <- filter_stats(barb, bitwise.dist, plot = TRUE, nclone = nInd(barb) - 10)
title(paste(i, "SNPS:", nLoc(barb)))
minthresh <- fstats$average$thresholds[10] + .Machine$double.eps^0.5
# nearthresh <- fstats$nearest$thresholds[10] + .Machine$double.eps^0.5
# farthresh <- fstats$farthest$thresholds[10] + .Machine$double.eps^0.5
# abline(v = minthresh, lty = 2)
# abline(v = nearthresh, lty = 2, col = "red")
# abline(v = farthresh, lty = 2, col = "blue")
# legend("topright", legend = c("Nearest Neighbor", "UPGMA", "Farthest Neighbor"),
# col = c("red", "black", "blue"), pch = 1, title = "Clustering Method")
thresh <- mlg.filter(barb, distance = bitwise.dist, algorithm = "a",
threshold = minthresh)
trueclones <- vapply(strsplit(indNames(barb), "_"), "[[", character(1), 1)
trueclones <- duplicated(trueclones)
thresh <- duplicated(thresh)
contlist[[i]] <- table(thresh, trueclones)
threshlist[[i]] <- fstats$average$thresholds
}
##
## Reading PLINK raw format into a genlight object...
##
##
## Reading loci information...
##
## Reading and converting genotypes...
## .
## Building final object...
##
## ...done.
##
## === S4 class genlight ===
## 77 genotypes, 11057 binary SNPs
## Ploidy: 2
## 170008 (0.2 %) missing data
## @pop: individual membership for 9 populations
## @loc.names: labels of the SNPs
## @other: a list containing: sex phenotype pat mat
##
## Reading PLINK raw format into a genlight object...
##
##
## Reading loci information...
##
## Reading and converting genotypes...
## .
## Building final object...
##
## ...done.
##
## === S4 class genlight ===
## 77 genotypes, 4353 binary SNPs
## Ploidy: 2
## 69547 (0.21 %) missing data
## @pop: individual membership for 9 populations
## @loc.names: labels of the SNPs
## @other: a list containing: sex phenotype pat mat
##
## Reading PLINK raw format into a genlight object...
##
##
## Reading loci information...
##
## Reading and converting genotypes...
## .
## Building final object...
##
## ...done.
##
## === S4 class genlight ===
## 78 genotypes, 502 binary SNPs
## Ploidy: 2
## 6359 (0.16 %) missing data
## @pop: individual membership for 9 populations
## @loc.names: labels of the SNPs
## @other: a list containing: sex phenotype pat mat
##
## Reading PLINK raw format into a genlight object...
##
##
## Reading loci information...
##
## Reading and converting genotypes...
## .
## Building final object...
##
## ...done.
##
## === S4 class genlight ===
## 77 genotypes, 7736 binary SNPs
## Ploidy: 2
## 126467 (0.21 %) missing data
## @pop: individual membership for 9 populations
## @loc.names: labels of the SNPs
## @other: a list containing: sex phenotype pat mat
Print the contingency tables and differences between threshold values to see if there is a large jump indicating a separation between replicates and independent samples.
print(contlist)
## $m3
## trueclones
## thresh FALSE TRUE
## FALSE 60 7
## TRUE 7 3
##
## $m4
## trueclones
## thresh FALSE TRUE
## FALSE 61 6
## TRUE 6 4
##
## $m10
## trueclones
## thresh FALSE TRUE
## FALSE 57 8
## TRUE 11 2
##
## $def
## trueclones
## thresh FALSE TRUE
## FALSE 61 6
## TRUE 6 4
for (i in threshlist){
plot(diff(i), log = "y")
}
## Warning in xy.coords(x, y, xlabel, ylabel, log): 2 y values <= 0 omitted
## from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 14 y values <= 0 omitted
## from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot
No such luck.
Now, we are plotting the tree where 10 samples are collapsed via average neighbor (UPGMA) and color the tips with the true duplicates.
Note about the figure: Tips are colored blue. Internal branches are colored black. If the algorithm found samples with a distance below the threshold, their connecting branches are colored red. The duplicated samples have red labels
defupgma <- phangorn::upgma(bitwise.dist(barb))
z <- filter_stats(barb, bitwise.dist, threshold = minthresh, stats = "MLGS")
barbnames <- vapply(strsplit(indNames(barb), "_"), "[[", character(1), 1)
dupes <- barbnames[duplicated(barbnames)]
thecols <- ifelse(barbnames %in% dupes, "red", "black")
color_mlg_tree(barb, defupgma, z$average, tip.color = thecols)
axisPhylo(1)